package com.offcn.bigdata.spark.sql.p1

import org.apache.spark.sql.{Column, DataFrame, SparkSession}

/**
  * sparksql的编程入门体验
  *  2.0以前的版本入口：
  *     SQLContext    是一个通用的编程入口
  *     HiveContext   可以集成Hive诸多特性的编程入口
  *  2.0之后便统一了编程入口：
  *     SparkSession
  *
  */
object _01SparkSQLOps {
    /**
      * Demo entry point: builds a local SparkSession, loads a JSON file into a
      * DataFrame, and walks through equivalent DSL and SQL query styles
      * (projection, column expressions, filtering, aggregation, temp-view SQL).
      *
      * @param args unused command-line arguments
      */
    def main(args: Array[String]): Unit = {
        // Unified entry point since Spark 2.0 (replaces SQLContext/HiveContext).
        val spark = SparkSession.builder()
                        .appName(s"${_01SparkSQLOps.getClass.getSimpleName}")
                        .master("local[*]")
//                        .enableHiveSupport() // enable Hive-specific features (needs a Hive setup)
                        .getOrCreate()
        // Load the data; schema is inferred from the JSON records.
        val pdf: DataFrame = spark.read.json("file:/E:/data/spark/sql/people.json")
        // Print the table's schema (column names and inferred types).
        pdf.printSchema()
        // Show the table contents (first 20 rows by default).
        pdf.show()
        // select name, age, height from xxx
        println("select----->")
        pdf.select("name", "age", "height").show()
        println("select--col--->")
        pdf.select(new Column("name"), new Column("age")).show()
        println("select--col2--->")
        import spark.implicits._ // required for the $"col" expression syntax used below
        pdf.select($"name", $"age", $"height").show()

        println("select--运算操作--->")

        // Column-based arithmetic: age + 1, height - 1, each re-aliased.
        pdf.select(new Column("name"), new Column("age").+(1).as("age"),
            new Column("height").-(1).as("height")).show()
        println("select--运算操作,基于表达式--->")
        // Expression-based equivalent of the Column-based select above.
        // FIX: height was "+ 1" here but "- 1" above — aligned to "- 1" so the
        // two examples actually demonstrate the same operation.
        pdf.select($"name", ($"age" + 1).as("age"), ($"height" - 1).as("height")).show()

        println("select--条件查询--->")
        //select * from xxx where province = "福建" and age > 15
        pdf.select("name", "age", "height", "province")
//                .where("age > 15")
//                .where("province = '福建'")
                .where("age > 15 and province = '福建'")
                .show()

        println("select--聚合统计--->")
        //select province, count(1) as count from Xxx group by province
        pdf.select("province").groupBy("province").count().show()
        //select province, max(age), max(height), count(1) from Xxx group by province
        pdf.select("province", "age", "height").groupBy("province")
                .agg(Map(
                    "age" -> "max",
                    "height" -> "max",
                    "province" -> "count"
                )).show()
        // The operations above are SparkSQL's DSL (domain-specific language) style.
        // Register the DataFrame as a temp view so it can be queried with SQL text.
        pdf.createOrReplaceTempView("people")
        spark.sql(
            s"""
              |select
              |  province,
              |  max(age) maxAge,
              |  max(height) maxHeight,
              |  count(1) as count
              |from people
              |group by province
            """.stripMargin)
            .show()
        spark.stop()
    }
}
